Chapter 3 Data statistics

load("data/data.Rdata")

3.1 filter samples with high host data

sample_metadata <- sample_metadata%>%
  filter(!sample %in% c("EHI02721", "EHI02712", "EHI02700", "EHI02720", "EHI02749", "EHI02719", "EHI02729", "EHI02715", "EHI02722"))

read_counts <- read_counts %>% 
    select(one_of(c("genome",sample_metadata$sample)))%>%
  filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
  select_if(~!all(. == 0))

genome_metadata <- genome_metadata %>% 
  semi_join(., read_counts, by = "genome") %>% 
  arrange(match(genome,read_counts1$genome))

genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips

#load("data/genome_gifts.Rdata")

3.2 Sequencing reads statistics

sample_metadata %>% 
    summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2), 
              mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
              sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
    unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
    tt()
tinytable_mlxrdgkl0941ux9o1nm6
Total Average
222.06 4.93 ± 1.08

3.3 DNA fractions

sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
    left_join(sample_metadata, by = join_by(sample == sample)) %>%
    select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
    mutate(mags_bases = mags*146) %>%
    mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
    mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
    mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
    select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)

sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  tt()
tinytable_etdpkjmhhsqk4wi4oa1o
Sample Low quality Mapped to host Unmapped Mapped to MAGs
EHI01861 0.2815721 0.011886522 1.2899665 2.94492935
EHI01963 0.2379339 0.377956197 0.7686364 2.26737109
EHI01964 0.3498799 2.739970327 0.5635412 1.32056898
EHI01965 0.3583169 0.398887483 1.3439432 3.82431144
EHI01969 0.2444974 0.026139904 0.7635264 2.53338864
EHI01971 0.3382550 0.014993899 1.0255034 3.94601821
EHI01972 0.4703327 0.054667824 1.3684974 5.49040300
EHI01973 0.3347793 0.064729480 1.5130592 4.84065102
EHI01974 0.3671355 0.052747090 0.8921925 2.95848005
EHI01975 0.4305111 0.123580711 1.9003492 5.31791948
EHI01976 0.3856202 0.011574334 1.2666400 3.26276449
EHI01977 0.4158680 0.747098387 1.7867623 3.87494103
EHI01978 0.3929906 0.004864216 1.2394723 3.63470402
EHI01980 0.3711274 0.013798033 1.7409090 5.56779249
EHI01981 0.3633649 0.020488667 1.2293197 3.03525532
EHI01983 0.2418940 0.146776195 0.8284920 2.84914941
EHI01984 0.4431283 0.007477209 1.2629388 5.76052943
EHI01986 0.3104557 0.016788774 0.9597449 3.38641569
EHI01987 0.4292930 0.001120181 0.9160842 3.32218036
EHI01988 0.2963428 0.197271999 1.0599471 3.92577283
EHI01990 0.3186345 0.010910854 1.0992821 3.87615385
EHI01991 0.3161992 0.021159611 1.1871737 2.82147205
EHI01993 0.2026808 0.006054774 0.5074436 2.59915566
EHI01994 0.3359364 0.064796021 0.9475828 3.88007790
EHI02699 0.2717760 0.007940738 1.1803471 3.05661322
EHI02700 0.3187046 3.815365599 0.7469281 0.43060671
EHI02701 0.2059134 0.135887208 0.9201707 2.92496093
EHI02705 0.2326028 0.134644975 1.0103929 3.05008177
EHI02707 0.2133975 0.197419011 1.0367591 2.56043485
EHI02708 0.3287951 0.247517459 1.4970876 3.72574889
EHI02710 0.2905179 0.004863423 1.1544139 3.25985646
EHI02712 0.3735243 4.162982418 0.5973263 0.12825939
EHI02714 0.2229262 0.166057029 0.7697399 3.24492767
EHI02715 0.3350685 3.050470506 0.9265001 0.43417816
EHI02718 0.2124597 1.622070570 0.4994065 1.33393017
EHI02719 0.3330425 2.607725990 1.8321792 0.33907536
EHI02720 0.3031357 3.089773324 0.7364466 0.19044999
EHI02721 0.3228625 3.677630914 0.6704712 0.08601152
EHI02722 0.3095520 2.799929187 0.7276464 0.41217450
EHI02723 0.2730593 0.113227422 0.9750562 3.21338043
EHI02724 0.3551211 0.121469083 0.9809314 3.69454095
EHI02728 0.3109121 1.043367740 0.9818139 2.94984474
EHI02729 0.3043389 2.024187975 1.5671125 0.47668212
EHI02749 0.4389880 4.177705067 0.7870159 0.53921786
EHI02751 0.4202354 3.712236730 0.7804933 1.77651793
sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
  tt()
tinytable_hjby3zht6rdxo65h9g6t
Low quality Mapped to host Unmapped Mapped to MAGs
0.3247485 0.9344047 1.063094 2.779287
sequence_fractions %>%
    pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
    mutate(value = value / 1000000000) %>%
    mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
  
    ggplot(., aes(x = sample, y = value, fill=fraction)) +
        geom_bar(position="stack", stat = "identity") +
      scale_fill_manual(name="Sequence type",
                    breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
                    labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                    values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
        labs(x = "Samples", y = "Amount of data (GB)") +
        theme_classic() +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

3.4 Recovered microbial fraction

singlem_table <- sequence_fractions %>%
    mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
    left_join(sample_metadata, by = join_by(sample == sample))  %>%
    mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
    select(sample,mags_proportion,singlem_proportion) %>%
    mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
    mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify

singlem_table %>%
    pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
    mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
    ggplot(., aes(x = value, y = sample, color=proportion)) +
            geom_line(aes(group = sample), color = "#f8a538") +
            geom_point() +
      scale_color_manual(name="Proportion",
                    breaks=c("mags_proportion","singlem_proportion"),
                    labels=c("Recovered","Estimated"),
                    values=c("#52e1e8", "#876b53"))+
            theme_classic() +
            labs(y = "Samples", x = "Prokaryotic fraction (%)") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "right")

damr <- singlem_table %>%
  mutate(damr=ifelse(is.na(singlem_proportion),100,mags_proportion/singlem_proportion*100)) %>%
  select(sample,damr)

damr %>% 
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  filter(diet=="Wild")%>% 
  summarise(mean=mean(damr),sd=sd(damr)) %>% 
  tt()
tinytable_y5ancqpf6htjzydnzwq4
mean sd
53.68476 25.59637
damr %>% 
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  filter(region=="Nafarroa")%>% 
  summarise(mean=mean(damr),sd=sd(damr)) %>% 
  tt()
tinytable_9bqbvq3wt3ucfh6ayxxu
mean sd
75.85542 4.081913
damr %>% 
  tt()
tinytable_q6bhndhusitu493eqwec
sample damr
EHI01861 69.54
EHI01963 74.68
EHI01964 70.09
EHI01965 74.00
EHI01969 76.84
EHI01971 79.37
EHI01972 80.05
EHI01973 76.19
EHI01974 76.83
EHI01975 73.67
EHI01976 72.04
EHI01977 68.44
EHI01978 74.57
EHI01980 76.18
EHI01981 71.17
EHI01983 77.47
EHI01984 82.02
EHI01986 77.92
EHI01987 78.39
EHI01988 78.74
EHI01990 77.91
EHI01991 70.38
EHI01993 83.67
EHI01994 80.37
EHI02699 72.14
EHI02700 36.57
EHI02701 76.07
EHI02705 75.12
EHI02707 71.18
EHI02708 71.34
EHI02710 73.85
EHI02712 17.68
EHI02714 80.83
EHI02715 31.91
EHI02718 72.76
EHI02719 15.62
EHI02720 20.55
EHI02721 11.37
EHI02722 36.16
EHI02723 76.72
EHI02724 79.02
EHI02728 75.03
EHI02729 23.32
EHI02749 40.66
EHI02751 69.48
damr %>% 
  filter(damr>95) %>%
  nrow()
[1] 0
damr %>% 
  summarise(mean=mean(damr),sd=sd(damr)) %>% 
  tt()
tinytable_cygwniwijxpnuynajyrl
mean sd
65.50911 20.77582